{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !wget https://huggingface.co/datasets/mesolitica/chatgpt-explain-sentiment/resolve/main/explain-sentiment.json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/husein/dev/malaya/malaya/tokenizer.py:214: FutureWarning: Possible nested set at position 3397\n",
      "  self.tok = re.compile(r'({})'.format('|'.join(pipeline)))\n",
      "/home/husein/dev/malaya/malaya/tokenizer.py:214: FutureWarning: Possible nested set at position 3927\n",
      "  self.tok = re.compile(r'({})'.format('|'.join(pipeline)))\n"
     ]
    }
   ],
   "source": [
    "from tqdm import tqdm\n",
    "import json\n",
    "import random\n",
    "import torch\n",
    "import numpy as np\n",
    "from transformers import AutoTokenizer, T5Config\n",
    "from malaya.torch_model.t5 import T5ForSequenceClassification"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "162895"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "with open('explain-sentiment.json') as fopen:\n",
    "    data = json.load(fopen)\n",
    "    \n",
    "texts, labels = [], []\n",
    "\n",
    "labels_sentiment = ['negative', 'neutral', 'positive']\n",
    "for d in data:\n",
    "    try:\n",
    "        l = json.loads(d[1])\n",
    "        if not isinstance(l['sentiment'], str):\n",
    "            continue\n",
    "        if l['sentiment'] not in labels_sentiment:\n",
    "            continue\n",
    "        if not len(d[0]):\n",
    "            continue\n",
    "        texts.append(d[0])\n",
    "        labels.append(labels_sentiment.index(l['sentiment']))\n",
    "    except:\n",
    "        pass\n",
    "    \n",
    "len(texts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('Terimakasih ya', 2),\n",
       " ('Enjin klik trus scroll ini yu', 1),\n",
       " ('Karya tangan anak bangsa sekarang patut diacungi jempol apalagi produknya yang yang bagus, banggalah kita memakai k',\n",
       "  2),\n",
       " ('BABUN PUNYA AMZAR. TAK RETI HORMAT ORANG KEEEEEE???!!!!', 0),\n",
       " ('kau tak payah bagi data ni pukimak!! wahai pu', 0),\n",
       " ('Mau nikah ni biasanya', 1),\n",
       " ('Awak ada personal issue dengan Klang ke?', 1),\n",
       " ('Batu bersurat kuala berang tahun 1303.', 1),\n",
       " ('Dah tak tahu nak buat apa dengan info ni.', 0),\n",
       " ('hati hati ya semua. Kami .I anti main sentimen politik buka aib. Jual maruah manusia....I alert semua',\n",
       "  0)]"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "list(zip(texts[:10], labels[:10]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(array([0, 1, 2]), array([63996, 66104, 32795]))"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.unique(labels, return_counts = True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "train_X, test_X, train_Y, test_Y = train_test_split(\n",
    "    texts, labels, test_size = 0.1\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "config = T5Config.from_pretrained('mesolitica/nanot5-small-malaysian-cased')\n",
    "config.num_labels = len(set(labels))\n",
    "config.vocab = labels_sentiment"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Some weights of T5ForSequenceClassification were not initialized from the model checkpoint at mesolitica/nanot5-small-malaysian-cased and are newly initialized: ['classifier.weight', 'classifier.bias']\n",
      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
     ]
    }
   ],
   "source": [
    "model = T5ForSequenceClassification.from_pretrained('mesolitica/nanot5-small-malaysian-cased', config = config)\n",
    "_ = model.cuda()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Loading the tokenizer from the `special_tokens_map.json` and the `added_tokens.json` will be removed in `transformers 5`,  it is kept for forward compatibility, but it is recommended to update your `tokenizer_config.json` by uploading it again. You will see the new `added_tokens_decoder` attribute that will store the relevant information.\n",
      "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
     ]
    }
   ],
   "source": [
    "tokenizer = AutoTokenizer.from_pretrained('mesolitica/nanot5-base-malaysian-cased')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "trainable_parameters = [param for param in model.parameters() if param.requires_grad]\n",
    "trainer = torch.optim.AdamW(trainable_parameters, lr = 2e-4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 9163/9163 [05:00<00:00, 30.45it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "epoch: 0, loss: 0.4947291410679668, dev_predicted: 0.6881133464180569\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 9163/9163 [05:11<00:00, 29.43it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "epoch: 1, loss: 0.4765375747912435, dev_predicted: 0.6843719332679097\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 9163/9163 [05:09<00:00, 29.64it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "epoch: 2, loss: 0.4514005017429948, dev_predicted: 0.6857826300294406\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 9163/9163 [04:21<00:00, 35.05it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "epoch: 3, loss: 0.42360373258100387, dev_predicted: 0.6813052011776252\n"
     ]
    }
   ],
   "source": [
    "batch_size = 16\n",
    "epoch = 100\n",
    "\n",
    "best_dev_acc = -np.inf\n",
    "patient = 3\n",
    "current_patient = 0\n",
    "\n",
    "for e in range(epoch):\n",
    "    pbar = tqdm(range(0, len(train_X), batch_size))\n",
    "    losses = []\n",
    "    for i in pbar:\n",
    "        trainer.zero_grad()\n",
    "        x = train_X[i: i + batch_size]\n",
    "        y = np.array(train_Y[i: i + batch_size])\n",
    "        \n",
    "        padded = tokenizer(x, padding = 'longest', return_tensors = 'pt')\n",
    "        padded['labels'] = torch.from_numpy(y)\n",
    "        for k in padded.keys():\n",
    "            padded[k] = padded[k].cuda()\n",
    "            \n",
    "        loss, pred = model(**padded)\n",
    "        loss.backward()\n",
    "        \n",
    "        grad_norm = torch.nn.utils.clip_grad_norm_(trainable_parameters, 5.0)\n",
    "        trainer.step()\n",
    "        losses.append(float(loss))\n",
    "        \n",
    "    dev_predicted = []\n",
    "    for i in range(0, len(test_X), batch_size):\n",
    "        x = test_X[i: i + batch_size]\n",
    "        y = np.array(test_Y[i: i + batch_size])\n",
    "        padded = tokenizer(x, padding = 'longest', return_tensors = 'pt')\n",
    "        padded['labels'] = torch.from_numpy(y)\n",
    "        for k in padded.keys():\n",
    "            padded[k] = padded[k].cuda()\n",
    "        \n",
    "        loss, pred = model(**padded)\n",
    "        dev_predicted.append((pred.argmax(axis = 1).detach().cpu().numpy() == y).mean())\n",
    "        \n",
    "    dev_predicted = np.mean(dev_predicted)\n",
    "    \n",
    "    print(f'epoch: {e}, loss: {np.mean(losses)}, dev_predicted: {dev_predicted}')\n",
    "    \n",
    "    if dev_predicted >= best_dev_acc:\n",
    "        best_dev_acc = dev_predicted\n",
    "        current_patient = 0\n",
    "        model.save_pretrained('small')\n",
    "    else:\n",
    "        current_patient += 1\n",
    "    \n",
    "    if current_patient >= patient:\n",
    "        break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [],
   "source": [
    "padded = tokenizer(texts[:10], padding = True, return_tensors = 'pt')\n",
    "for k in padded.keys():\n",
    "    padded[k] = padded[k].cuda()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([2, 1, 2, 0, 0, 1, 1, 1, 1, 0])"
      ]
     },
     "execution_count": 61,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model(**padded)[0].detach().cpu().numpy().argmax(axis = 1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[2, 1, 2, 0, 0, 1, 1, 1, 0, 0]"
      ]
     },
     "execution_count": 62,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "labels[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Terimakasih ya',\n",
       " 'Enjin klik trus scroll ini yu',\n",
       " 'Karya tangan anak bangsa sekarang patut diacungi jempol apalagi produknya yang yang bagus, banggalah kita memakai k',\n",
       " 'BABUN PUNYA AMZAR. TAK RETI HORMAT ORANG KEEEEEE???!!!!',\n",
       " 'kau tak payah bagi data ni pukimak!! wahai pu',\n",
       " 'Mau nikah ni biasanya',\n",
       " 'Awak ada personal issue dengan Klang ke?',\n",
       " 'Batu bersurat kuala berang tahun 1303.',\n",
       " 'Dah tak tahu nak buat apa dengan info ni.',\n",
       " 'hati hati ya semua. Kami .I anti main sentimen politik buka aib. Jual maruah manusia....I alert semua']"
      ]
     },
     "execution_count": 63,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "texts[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['negative', 'neutral', 'positive']"
      ]
     },
     "execution_count": 64,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "labels_sentiment"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [],
   "source": [
    "model_ = T5ForSequenceClassification.from_pretrained('small')\n",
    "_ = model_.cuda()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 1019/1019 [00:08<00:00, 119.27it/s]\n"
     ]
    }
   ],
   "source": [
    "real_Y = []\n",
    "for i in tqdm(range(0, len(test_X), batch_size)):\n",
    "    x = test_X[i: i + batch_size]\n",
    "    y = np.array(test_Y[i: i + batch_size])\n",
    "    padded = tokenizer(x, padding = 'longest', return_tensors = 'pt')\n",
    "    padded['labels'] = torch.from_numpy(y)\n",
    "    for k in padded.keys():\n",
    "        padded[k] = padded[k].cuda()\n",
    "\n",
    "    loss, pred = model(**padded)\n",
    "    real_Y.extend(pred.argmax(axis = 1).detach().cpu().numpy().tolist())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "              precision    recall  f1-score   support\n",
      "\n",
      "    negative    0.71272   0.73407   0.72323      6276\n",
      "     neutral    0.66692   0.66672   0.66682      6628\n",
      "    positive    0.64844   0.61282   0.63012      3386\n",
      "\n",
      "    accuracy                        0.68146     16290\n",
      "   macro avg    0.67602   0.67120   0.67339     16290\n",
      "weighted avg    0.68072   0.68146   0.68093     16290\n",
      "\n"
     ]
    }
   ],
   "source": [
    "from sklearn import metrics\n",
    "\n",
    "print(\n",
    "    metrics.classification_report(\n",
    "        real_Y, test_Y, target_names = labels_sentiment,\n",
    "        digits = 5\n",
    "    )\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "f6c9f79142174bd4ac8942da1006aaf2",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "model.safetensors:   0%|          | 0.00/167M [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "CommitInfo(commit_url='https://huggingface.co/mesolitica/sentiment-analysis-nanot5-small-malaysian-cased/commit/d083efe7f8cd67a1ed82b41c58a9d5784a9f2f91', commit_message='Upload T5ForSequenceClassification', commit_description='', oid='d083efe7f8cd67a1ed82b41c58a9d5784a9f2f91', pr_url=None, pr_revision=None, pr_num=None)"
      ]
     },
     "execution_count": 68,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model_.push_to_hub('mesolitica/sentiment-analysis-nanot5-small-malaysian-cased', safe_serialization = True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "CommitInfo(commit_url='https://huggingface.co/mesolitica/sentiment-analysis-nanot5-small-malaysian-cased/commit/83c27565b162b115d7deae8758032101fe4e3dde', commit_message='Upload tokenizer', commit_description='', oid='83c27565b162b115d7deae8758032101fe4e3dde', pr_url=None, pr_revision=None, pr_num=None)"
      ]
     },
     "execution_count": 69,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer.push_to_hub('mesolitica/sentiment-analysis-nanot5-small-malaysian-cased', safe_serialization = True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
